3. Predict Runoff Statistics from Catchment Attributes#

../_images/param_prediction_test_result.png

Fig. 3.1 Example result showing the prediction error after each group of predictors is added, a scatter plot of observed and predicted mean runoff predicted from catchment attributes based on all predictors, the learning curve of training and test sets, and the distribution of target variables.#

3.1. Introduction#

We compute the (ordinary) statistics of runoff time series for a large sample of catchments. We then test the predictability of these statistics from catchment attributes using a gradient boosting machine learning approach. The purpose of testing predictability of the order statistics and L-moments is to fully describe parametric distributions for estimating streamflow distributions (flow duration curves), a common problem in applied hydrology.

Catchment attributes are used as predictors of each order statistic. Attributes are added cumulatively in successive tests to compare the contribution of catchment attribute groups related to climate, terrain, land cover, and soil.

We test if transforming the target variable has an effect on the xgboost model. For transformations that do not change the rank of the target variable, i would not expect to see an effect, however the metric used as the objective function may be sensitive to the distribution of target variables, that is sensitive to outliers. We test the predictability of the following:

Statistic

Description

uar_mean

Mean of the <param> series

uar_median

Median of the <param> series

uar_std

Standard deviation of the <param> series

uar_mad

Mean Absolute Deviation (MAD) of the <param> series

log_<param>_mean

Mean of the log transformed <param> series (mean, median, stdev, MAD)

Notes:

  • uar: unit area runoff is expressed in \(L/s/\text{km}^2\).

import os
import pandas as pd
import numpy as np
from pathlib import Path
import xarray as xr
from scipy.stats import skew, kurtosis
from math import comb

from bokeh.plotting import figure, show
from bokeh.layouts import gridplot, row, column
from bokeh.models import HoverTool, ColumnDataSource, Whisker, Legend, LegendItem
from bokeh.transform import factor_cmap, factor_mark
from bokeh.palettes import Category10, Category20, Category20c, Viridis
from bokeh.io import output_notebook
# from bokeh.palettes import Sunset10, Vibrant7

from sklearn.metrics import r2_score

import data_processing_functions as dpf

from scipy.stats import linregress
output_notebook()

BASE_DIR = os.getcwd()
Loading BokehJS ...

3.2. Load Input Data#

# extra stations to exclude (see Notebook 1: Data)
exclude = ['15056030', '08FA009', '08GA037', '08NC003', '12052500', '12090480', '12107950', '12108450', '12119300',
           '12119450', '12200684', '12200762', '12203000', '12409500', '15056070', '15081510']
# load the catchment characteristics
rev_date = '20250227'
HYSETS_DIR = Path('/home/danbot/code/common_data/HYSETS')

# load camels hydro attributes to compare with the BCUB data
camels_df = pd.read_csv('data/camels/camels_hydro.txt', sep=';')
camels_df['gauge_id'] = camels_df['gauge_id'].astype(str)
camels_df.head()

hs_df = pd.read_csv('data/HYSETS_watershed_properties.txt', sep=';', dtype={'Watershed_ID': int, 'Official_ID': str})

# create dictionaries for quick lookups
da_dict = {row['Official_ID']: row['Drainage_Area_km2'] for _, row in hs_df.iterrows()}
# needed to map between watershed ID in Hysets data and official_ID
watershed_id_dict = {row['Watershed_ID']: row['Official_ID'] for _, row in hs_df.iterrows()}
# and the inverse
official_id_dict = {row['Official_ID']: row['Watershed_ID'] for _, row in hs_df.iterrows()}
def match_with_padding(oid):
    if oid in hs_df['Official_ID'].values:
        return oid
    print(f'{oid} not found in HYSETS data, trying padded versions...')
    for pad in range(1, 4):
        padded = oid.zfill(len(oid) + pad)
        if padded in hs_df['Official_ID'].values:
            print(f'    Found padded version: {padded}')
            return padded
    raise ValueError(f"Official ID {oid} not found in HYSETS data, even with padding.")
attribute_file = f'BCUB_watershed_attributes_updated_{rev_date}.csv'
updated_attribute_file = 'catchment_attributes_with_runoff_stats.csv'
if not os.path.exists(os.path.join('data', updated_attribute_file)):
    print(f'Updated attribute file {updated_attribute_file} not found. Using {attribute_file} instead.')
    updated_attribute_path = os.path.join('data', attribute_file)
    process_statistics = True
else:
    updated_attribute_path = os.path.join(os.getcwd(), 'data', updated_attribute_file)
    process_statistics = False

df = pd.read_csv(updated_attribute_path, dtype={'official_id': str})
df['official_id'] = df['official_id'].apply(lambda x: match_with_padding(x))
df = df[~df['official_id'].isin(exclude)]
df = df[[c for c in df.columns if 'unnamed:' not in c.lower()]]
df.columns = [c.lower() for c in df.columns]
df.sort_values('official_id', inplace=True)
df.reset_index(drop=True, inplace=True)
def load_and_filter_hysets_data(station_ids, hs_df):

    # load the updated HYSETS data
    updated_filename = 'HYSETS_2023_update_QC_stations.nc'
    ds = xr.open_dataset(HYSETS_DIR / updated_filename)

    # Get valid IDs as a NumPy array
    hs_df = hs_df[hs_df['Official_ID'].isin(station_ids)]
    selected_ids = hs_df['Watershed_ID'].values

    # Get boolean index where watershedID in selected_set
    # safely access watershedID as a variable first
    ws_ids = ds['watershedID'].data  # or .values if you prefer
    mask = np.isin(ws_ids, selected_ids)

    # Apply mask to data
    ds = ds.sel(watershed=mask)
    # Step 1: Promote 'watershedID' to a coordinate on the 'watershed' dimension
    ds = ds.assign_coords(watershedID=("watershed", ds["watershedID"].data))

    # Step 2: Set 'watershedID' as the index for the 'watershed' dimension
    return ds.set_index(watershed="watershedID")


def retrieve_timeseries_discharge(stn, ds):
    watershed_id = official_id_dict[stn]
    # drainage_area = self.ctx.da_dict[stn]
    # data = self.ctx.data
    da = da_dict[stn]
    df = ds['discharge'].sel(watershed=str(watershed_id)).to_dataframe(name='discharge').reset_index()
    df = df.set_index('time')[['discharge']]
    df.dropna(inplace=True)
    # clip minimum flow to 1e-4
    df['discharge'] = np.clip(df['discharge'], 1e-4, None)
    df.rename(columns={'discharge': stn}, inplace=True)
    df[f'{stn}_uar'] = 1000 * df[stn] / da
    df[f'{stn}_mm'] = df[stn] * (24 * 3.6 / da)
    df['log_x'] = np.log(df[f'{stn}_uar'])
    return df
ds = load_and_filter_hysets_data(df['official_id'], hs_df)

Filter the dataset for stations meeting minimum record length.

from lmoments3 import distr

def stats(values):
    out = {}
    for label in ['uar', 'log_uar']:
        if label.startswith('log_'):
            values = np.log(values)
        # classical moments
        m   = values.mean()
        median = np.median(values)
        s   = values.std(ddof=1)
        mad = np.mean(np.abs(values - m))
        sk  = pd.Series(values).skew()
        kt  = pd.Series(values).kurtosis()
        # l-moments
        params = distr.gev.lmom_fit(values)

        out.update({
            f'{label}_mean': m,
            f'{label}_median': median,
            f'{label}_mad': mad,
            f'{label}_std': s,
            f'{label}_skew': sk,
            f'{label}_kurt': kt,
            f'{label}_lmom_xi': params['c'],
            f'{label}_lmom_loc': params['loc'],
            f'{label}_lmom_scale': params['scale'],
        })
    return out

# reset the index to ensure the split is done correctly
def process_row(data):
    stn = str(data['official_id'])
    data = retrieve_timeseries_discharge(stn, ds)
    # Compute the runoff statistics
    runoff_data = stats(data[f'{stn}_uar'].values)
    camels_data = camels_df[camels_df['gauge_id'] == stn].copy()
    if len(camels_data) > 1:
        camels_q = camels_data['q_mean'].values[0]
        raise Exception(f'Multiple CAMELS data found for {stn}.')
    else:
        camels_q = camels_data['q_mean'].values[0] if not camels_data.empty else np.nan

    # Merge your existing mm‐based mean + the new metrics
    out = {
      **runoff_data,
      'camels_q_mean_mm': camels_q,
    }
    return pd.Series(out)
# set the minimum number of complete years required
min_record_years = 5

filtered_df = df[df['n_years'] >= min_record_years]

if process_statistics == True:
    print(f'Processing runoff statistics for {len(filtered_df)} stations')
    updated_fpath = os.path.join(os.getcwd(), 'data', f'catchment_attributes_with_runoff_stats.csv')
    stats_results = filtered_df.apply(lambda x: process_row(x), axis=1)
    target_cols = stats_results.columns.tolist()
    filtered_df.loc[stats_results.index, stats_results.columns] = stats_results
    print(f'   Saving updated attributes with runoff statistics for {len(filtered_df)} catchments to:', updated_fpath)
    filtered_df.to_csv(updated_fpath)
target_cols = [
    'uar_mean', 'uar_std', 'uar_median', 'uar_mad',
    'log_uar_mean', 'log_uar_std', 'log_uar_median', 'log_uar_mad',
]

# target_cols += [f'prob_q_lessthan_{threshold}' for threshold in [1e-4, 5e-4, 1e-3, 5e-3, 1e-2]]
assert np.all([c in filtered_df.columns for c in target_cols]), f"Not all target columns found in filtered_df: {target_cols} \n {list(df.columns)}"
from bokeh.models import ColorBar, LogColorMapper, ColumnDataSource

# visualize the distribution of the mean runoff
p1 = figure(title="Mean runoff distribution", width=500, height=300)
hist, edges = np.histogram(filtered_df['uar_mean'], density=True, bins=50)
p1.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color='white')
p1.xaxis.axis_label = 'Mean runoff [L/s/km^2]'
p1.yaxis.axis_label = 'Frequency'

# visualize a scatter plot of the mean and standard deviation
p2 = figure(title=f"Mean vs. Standard deviation (N={len(df)})", width=500, height=300)
# add color mapper to encode drainage_area_km2
mapper = LogColorMapper(palette='Viridis256', low=filtered_df['drainage_area_km2'].min(), 
                        high=filtered_df['drainage_area_km2'].max())
source = ColumnDataSource(filtered_df)
color_bar = ColorBar(color_mapper=mapper, width=8, location=(0,0), title='Drainage area [km²]',)
p2.add_layout(color_bar, 'right')
# add the scatter plot with color mapping
p2.scatter('uar_mean', 'uar_std', source=source, color={'field': 'drainage_area_km2', 'transform': mapper})

x = np.linspace(0, filtered_df['uar_mean'].max(), 100)
slope, intercept, r, pval, se = linregress(filtered_df['uar_mean'].values, filtered_df['uar_std'].values)
y = [slope*e + intercept for e in x]
p2.line(x, y, legend_label=f'L2: y={slope:.2f}x + {intercept:.2f} (R²={r**2:.2f})', 
       line_width=2, color='red', line_dash='dashed')

# plot the L1 Norm -- L2 is sensitive to outliers and is biased for low mean values
# l1_slope, l1_intercept = L1_fit_line(df['mean_uar'].values, df['sd_uar'].values)
# yl1 = [l1_slope*e + l1_intercept for e in x]
# p2.line(x, yl1, legend_label=f'L1: y={l1_slope:.2f}x + {l1_intercept:.2f}', 
#        line_width=2, color='red', line_dash='dotted')
p2.line([0, filtered_df['uar_mean'].max()], [0, filtered_df['uar_mean'].max()], 
        line_width=2, color='black', line_dash='dotted', legend_label='1:1')

# add hover tool to show official_id
p2.add_tools(HoverTool(tooltips=[('Official ID', '@official_id')]))
# p2.xaxis.axis_label = 'Mean runoff [mm/day]'
p2.xaxis.axis_label = 'Mean runoff [L/s/km^2]'
p2.yaxis.axis_label = 'Standard deviation'
p2.legend.location = 'top_left'
p2.legend.background_fill_alpha = 0.5
p2.legend.click_policy = 'hide'
show(row(p1, p2))

3.3. Define attribute groups#

terrain = ['drainage_area_km2', 'elevation_m', 'slope_deg', 'aspect_deg']
land_cover = [
    'land_use_forest_frac_2010', 'land_use_grass_frac_2010', 'land_use_wetland_frac_2010', 'land_use_water_frac_2010', 
    'land_use_urban_frac_2010', 'land_use_shrubs_frac_2010', 'land_use_crops_frac_2010', 'land_use_snow_ice_frac_2010']
climate = ['prcp', 'srad', 'swe', 'tmax', 'tmin', 'vp', 'high_prcp_freq', 'high_prcp_duration', 'low_prcp_freq', 'low_prcp_duration']
soil = ['logk_ice_x100', 'porosity_x100']

all_attributes = terrain + land_cover + soil + climate
assert len([c for c in all_attributes if c not in df.columns]) == 0
len(all_attributes)
24
results_folder = os.path.join(BASE_DIR, 'data', 'results', 'parameter_prediction_results')
if not os.path.exists(results_folder):
    os.makedirs(results_folder)
nfolds = 5
n_boost_rounds = 500
n_optimization_rounds = 20
loss = 'reg:squarederror'
# loss = 'reg:absoluteerror'

all_test_results = {}
attribute_set_dict = {
    'climate': climate, 
    '+land_cover': land_cover,
    '+terrain': terrain, 
    '+soil': soil,
}

3.4. Set Attribute Groupings#

group_1 = ['climate', '+terrain', '+land_cover', '+soil']
# for group 2, just reverse group 1
group_2 = group_1[::-1]
group_3 = ['+land_cover', '+terrain', '+soil', 'climate']
group_4 = ['+soil', 'climate', '+land_cover', '+terrain']
attribute_group_orders = [group_1, group_2, group_3, group_4]

3.5. Run XGBoost Models#

Separate the test set at the outset so the attribute group ordering is tested on the same hold-out set but necessarily on unique training optimizations. This ensures that at least the presence of outliers in the hold-out set should at least be constant across the attribute group reordering.

from xgb_functions import run_xgb_CV_trials

def predict_runoff_from_attributes(df, target_column, group_order, results_folder, n_boost_rounds, n_optimization_rounds, loss):
    """
    Note that here we're predicting the log mean unit area runoff
    """
    # set the target column
    predictor_attributes = []
    results = {}
    # add attribute groups successively
    for set_name in group_order:
        print(f' Processing {set_name} attribute set')
        predictor_attributes += attribute_set_dict[set_name] 
        input_data = df[['official_id'] + predictor_attributes + [target_column]].copy()
        result_df, all_predictions_df, all_convergence_df = run_xgb_CV_trials(
            set_name, predictor_attributes, target_column, input_data, 
            n_optimization_rounds, nfolds, n_boost_rounds, results_folder, 
            loss=loss,
        )
        # store the test set predictions and actuals
        results[set_name] = {
            'all_results': result_df,
            'convergence': all_convergence_df,
            'oos_predictions': all_predictions_df,
        } 
    return results
results_dict = {}
group_results_dict = {}
eval_metrics = {'reg:squarederror': 'test_rmse', 'reg:absoluteerror': 'test_mae'}
for target_col in target_cols:
    n  = 0
    min_error = 1e9    
    best_set = None
    results_dict[target_col] = {}
    group_results_dict[target_col] = {}
    print(f'TARGET: {target_col}')
    eval_metric = eval_metrics[loss]
    
    for group in attribute_group_orders:
        n += 1
        print(f'Processing: {group} ordering. {n}/{len(attribute_group_orders)}.')
        
        group_results_fname = f'{target_col}_prediction_results_{"".join(group)}.npy'
        group_results_fpath = os.path.join(results_folder, group_results_fname)

        print(f'Group results file: {group_results_fpath}')
        
        if os.path.exists(group_results_fpath):
            print(f'Retrieving existing results from {group_results_fpath}')
            group_results = np.load(group_results_fpath, allow_pickle=True).item()
        else:
            group_results = predict_runoff_from_attributes(filtered_df, target_col, group, results_folder, n_boost_rounds, n_optimization_rounds, loss)
            np.save(group_results_fpath, group_results)
    
        group_results_dict[target_col][n] = {'order': group, 'results': group_results}
        for k, test_data in group_results.items():
            # Save all results, not just the best
            results_dict[target_col][f'group_{n}'] = {
                'group_order': group,
                'test_error': test_data['all_results'][f'{eval_metric}_mean'],
                'test_error_std': test_data['all_results'][f'{eval_metric}_stdev'],
                'convergence': test_data['convergence'],
                'oos_predictions': test_data['oos_predictions'],
            }
            # print(f'Group {n} {k} {eval_metric}: {results_dict[target_col][f"group_{n}"]["test_error"]}')
            summary_csv = f'{target_col}_summary_group_{n}.csv'
            test_data['oos_predictions'].to_csv(os.path.join(results_folder, summary_csv), index=False)
TARGET: uar_mean
Processing: ['climate', '+terrain', '+land_cover', '+soil'] ordering. 1/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_mean_prediction_results_climate+terrain+land_cover+soil.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_mean_prediction_results_climate+terrain+land_cover+soil.npy
Processing: ['+soil', '+land_cover', '+terrain', 'climate'] ordering. 2/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_mean_prediction_results_+soil+land_cover+terrainclimate.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_mean_prediction_results_+soil+land_cover+terrainclimate.npy
Processing: ['+land_cover', '+terrain', '+soil', 'climate'] ordering. 3/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_mean_prediction_results_+land_cover+terrain+soilclimate.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_mean_prediction_results_+land_cover+terrain+soilclimate.npy
Processing: ['+soil', 'climate', '+land_cover', '+terrain'] ordering. 4/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_mean_prediction_results_+soilclimate+land_cover+terrain.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_mean_prediction_results_+soilclimate+land_cover+terrain.npy
TARGET: uar_std
Processing: ['climate', '+terrain', '+land_cover', '+soil'] ordering. 1/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_std_prediction_results_climate+terrain+land_cover+soil.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_std_prediction_results_climate+terrain+land_cover+soil.npy
Processing: ['+soil', '+land_cover', '+terrain', 'climate'] ordering. 2/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_std_prediction_results_+soil+land_cover+terrainclimate.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_std_prediction_results_+soil+land_cover+terrainclimate.npy
Processing: ['+land_cover', '+terrain', '+soil', 'climate'] ordering. 3/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_std_prediction_results_+land_cover+terrain+soilclimate.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_std_prediction_results_+land_cover+terrain+soilclimate.npy
Processing: ['+soil', 'climate', '+land_cover', '+terrain'] ordering. 4/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_std_prediction_results_+soilclimate+land_cover+terrain.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_std_prediction_results_+soilclimate+land_cover+terrain.npy
TARGET: uar_median
Processing: ['climate', '+terrain', '+land_cover', '+soil'] ordering. 1/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_median_prediction_results_climate+terrain+land_cover+soil.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_median_prediction_results_climate+terrain+land_cover+soil.npy
Processing: ['+soil', '+land_cover', '+terrain', 'climate'] ordering. 2/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_median_prediction_results_+soil+land_cover+terrainclimate.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_median_prediction_results_+soil+land_cover+terrainclimate.npy
Processing: ['+land_cover', '+terrain', '+soil', 'climate'] ordering. 3/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_median_prediction_results_+land_cover+terrain+soilclimate.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_median_prediction_results_+land_cover+terrain+soilclimate.npy
Processing: ['+soil', 'climate', '+land_cover', '+terrain'] ordering. 4/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_median_prediction_results_+soilclimate+land_cover+terrain.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_median_prediction_results_+soilclimate+land_cover+terrain.npy
TARGET: uar_mad
Processing: ['climate', '+terrain', '+land_cover', '+soil'] ordering. 1/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_mad_prediction_results_climate+terrain+land_cover+soil.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_mad_prediction_results_climate+terrain+land_cover+soil.npy
Processing: ['+soil', '+land_cover', '+terrain', 'climate'] ordering. 2/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_mad_prediction_results_+soil+land_cover+terrainclimate.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_mad_prediction_results_+soil+land_cover+terrainclimate.npy
Processing: ['+land_cover', '+terrain', '+soil', 'climate'] ordering. 3/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_mad_prediction_results_+land_cover+terrain+soilclimate.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_mad_prediction_results_+land_cover+terrain+soilclimate.npy
Processing: ['+soil', 'climate', '+land_cover', '+terrain'] ordering. 4/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_mad_prediction_results_+soilclimate+land_cover+terrain.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/uar_mad_prediction_results_+soilclimate+land_cover+terrain.npy
TARGET: log_uar_mean
Processing: ['climate', '+terrain', '+land_cover', '+soil'] ordering. 1/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_mean_prediction_results_climate+terrain+land_cover+soil.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_mean_prediction_results_climate+terrain+land_cover+soil.npy
Processing: ['+soil', '+land_cover', '+terrain', 'climate'] ordering. 2/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_mean_prediction_results_+soil+land_cover+terrainclimate.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_mean_prediction_results_+soil+land_cover+terrainclimate.npy
Processing: ['+land_cover', '+terrain', '+soil', 'climate'] ordering. 3/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_mean_prediction_results_+land_cover+terrain+soilclimate.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_mean_prediction_results_+land_cover+terrain+soilclimate.npy
Processing: ['+soil', 'climate', '+land_cover', '+terrain'] ordering. 4/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_mean_prediction_results_+soilclimate+land_cover+terrain.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_mean_prediction_results_+soilclimate+land_cover+terrain.npy
TARGET: log_uar_std
Processing: ['climate', '+terrain', '+land_cover', '+soil'] ordering. 1/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_std_prediction_results_climate+terrain+land_cover+soil.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_std_prediction_results_climate+terrain+land_cover+soil.npy
Processing: ['+soil', '+land_cover', '+terrain', 'climate'] ordering. 2/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_std_prediction_results_+soil+land_cover+terrainclimate.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_std_prediction_results_+soil+land_cover+terrainclimate.npy
Processing: ['+land_cover', '+terrain', '+soil', 'climate'] ordering. 3/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_std_prediction_results_+land_cover+terrain+soilclimate.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_std_prediction_results_+land_cover+terrain+soilclimate.npy
Processing: ['+soil', 'climate', '+land_cover', '+terrain'] ordering. 4/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_std_prediction_results_+soilclimate+land_cover+terrain.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_std_prediction_results_+soilclimate+land_cover+terrain.npy
TARGET: log_uar_median
Processing: ['climate', '+terrain', '+land_cover', '+soil'] ordering. 1/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_median_prediction_results_climate+terrain+land_cover+soil.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_median_prediction_results_climate+terrain+land_cover+soil.npy
Processing: ['+soil', '+land_cover', '+terrain', 'climate'] ordering. 2/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_median_prediction_results_+soil+land_cover+terrainclimate.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_median_prediction_results_+soil+land_cover+terrainclimate.npy
Processing: ['+land_cover', '+terrain', '+soil', 'climate'] ordering. 3/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_median_prediction_results_+land_cover+terrain+soilclimate.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_median_prediction_results_+land_cover+terrain+soilclimate.npy
Processing: ['+soil', 'climate', '+land_cover', '+terrain'] ordering. 4/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_median_prediction_results_+soilclimate+land_cover+terrain.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_median_prediction_results_+soilclimate+land_cover+terrain.npy
TARGET: log_uar_mad
Processing: ['climate', '+terrain', '+land_cover', '+soil'] ordering. 1/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_mad_prediction_results_climate+terrain+land_cover+soil.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_mad_prediction_results_climate+terrain+land_cover+soil.npy
Processing: ['+soil', '+land_cover', '+terrain', 'climate'] ordering. 2/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_mad_prediction_results_+soil+land_cover+terrainclimate.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_mad_prediction_results_+soil+land_cover+terrainclimate.npy
Processing: ['+land_cover', '+terrain', '+soil', 'climate'] ordering. 3/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_mad_prediction_results_+land_cover+terrain+soilclimate.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_mad_prediction_results_+land_cover+terrain+soilclimate.npy
Processing: ['+soil', 'climate', '+land_cover', '+terrain'] ordering. 4/4.
Group results file: /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_mad_prediction_results_+soilclimate+land_cover+terrain.npy
Retrieving existing results from /home/danbot/code/distribution_estimation/docs/notebooks/data/results/parameter_prediction_results/log_uar_mad_prediction_results_+soilclimate+land_cover+terrain.npy
label_dict = {
    'uar_mean': {'x': r'$$\mu_i \left[ L s^{-1} \text{km}^{-2}\right]$$', 'y': r'$$\hat \mu_i \left[L s^{-1} \text{km}^{-2} \right]$$'}, 
    'uar_std': {'x': r'$$\sigma_i \left[ L s^{-1} \text{km}^{-2}\right]$$', 'y': r'$$\hat \sigma_i \left[ L s^{-1} \text{km}^{-2} \right]$$'}, 
    'uar_median': {'x': r'$$\text{Median}_i \left[ L s^{-1} \text{km}^{-2}\right]$$', 'y': r'$$ \text{Median}_\text{est} \left[ L s^{-1} \text{km}^{-2} \right]$$'}, 
    'uar_mad': {'x': r'$$\text{MAD}_i \left[ L s^{-1} \text{km}^{-2}\right]$$', 'y': r'$$\text{MAD}_\text{est} \left[ L s^{-1} \text{km}^{-2} \right]$$'}, 
    'log_uar_mean': {'x': r'$$\log \mu \left[ L s^{-1} \text{km}^{-2} \right]$$', 'y': r'$$ \log \hat \mu  \left[ L s^{-1} \text{km}^{-2} \right]$$'}, 
    'log_uar_median': {'x': r'$$\log \text{Median} \left[ L s^{-1} \text{km}^{-2} \right]$$', 'y': r'$$ \text{Log Median}_\text{est}  \left[ L s^{-1} \text{km}^{-2} \right]$$'}, 
    'log_uar_std': {'x': r'$$\log\ \sigma_i \left[ L s^{-1} \text{km}^{-2} \right]$$', 'y': r'$$\log \hat \sigma_i  \left[ L s^{-1} \text{km}^{-2} \right]$$'}, 
    'log_uar_mad': {'x': r'$$\log\ \text{MAD}_i \left[ L s^{-1} \text{km}^{-2} \right]$$', 'y': r'$$ \text{Log MAD}_\text{est} \left[ L s^{-1} \text{km}^{-2} \right]$$'},
}

target_cols = [
    'uar_mean', 'uar_std', 'uar_median', 'uar_mad',
    'log_uar_mean', 'log_uar_std', 'log_uar_median', 'log_uar_mad',
]

3.6. View Results#

def create_results_plots(target_col, results_df, attribute_sets, eval_metric, title=''):    
    plots = []
    def _plot_attribute_order_line():
        means, lbs, ubs = [], [], []

        for e in attribute_sets:
            df = results_df[e]['all_results']
            trial_means = df[f'{eval_metric}_mean'].values
            means.append(np.mean(trial_means))
            lb, ub = np.percentile(trial_means, [2.5, 97.5])
            lbs.append(lb)
            ubs.append(ub)

        max_range = (0.95*min(lbs), max(ubs)*1.05)
        group_order = [e if not e.startswith('+') else e[1:] for e in attribute_sets]
        group_order = group_order[:1] + [f'+{e}' for e in group_order[1:]]  # add '+' to all but the first element
        source = ColumnDataSource({'x': group_order, 'y1': means, 'ub': ubs, 'lb': lbs})
    
        fig = figure(title='', x_range=group_order, y_range=max_range)#y_range=max_range)
        fig.scatter('x', 'y1', legend_label=eval_metric.upper(), color='green', source=source, line_width=3, marker='square', size=4)
        fig.add_layout(Whisker(source=source, base='x', upper='ub', lower='lb', line_width=1))
        fig.legend.background_fill_alpha = 0.6
        fig.yaxis.axis_label = r'$$\text{RMSE}$$'
        fig.xaxis.axis_label = r'$$\text{Attribute Groups}$$'
        best_set = min(attribute_sets, key=lambda x: results_df[x]['all_results'][f'{eval_metric}_mean'].mean())
        return fig, best_set


    def _plot_scatter_with_regression(best_result_df, xlabel, ylabel, target_col):
        trial_r2 = (
            best_result_df.groupby('trial')[['actual', 'predicted']]
            .apply(lambda df: r2_score(df['actual'], df['predicted']))
        )
        # r2_mean = trial_r2.mean()
        r2_std = trial_r2.std()
        grouped = best_result_df.groupby('trial').agg({
            'actual': 'median',
            'predicted': 'median',
        })
        grouped['diff'] = (grouped['actual'] - grouped['predicted']).abs()
        median_trial = grouped.sort_values('diff').index[len(grouped) // 2]
        best_result = best_result_df[best_result_df['trial'] == median_trial].copy()
        xx, yy = best_result['actual'].values, best_result['predicted'].values
        source = ColumnDataSource({'x': xx, 'y': yy, 'ID': best_result['official_id'].values})
        slope, intercept, r, p, se = linregress(xx, yy)

        sfig = figure(title='')
        sfig.scatter('x', 'y', size=3, alpha=0.8, source=source, legend_label=target_col, color='blue')
        sfig.add_tools(HoverTool(tooltips=[('ID', '@ID')]))
        x_obs = np.linspace(min(xx), max(xx), 1000)
        ybf = [slope * e + intercept for e in x_obs]
        sfig.line(x_obs, ybf, color='red', line_width=3, line_dash='dashed', legend_label=f'R²={r**2:.2f} ± {r2_std:.3f}')
        sfig.line([0, max(ybf)], [0, max(ybf)], color='black', line_dash='dotted', line_width=2, legend_label='1:1')
        sfig.xaxis.axis_label = xlabel
        sfig.yaxis.axis_label = ylabel
        sfig.legend.background_fill_alpha = 0.5
        sfig.legend.location = 'top_left'
        return sfig, median_trial

    def _plot_convergence(convergence_df, median_trial):
        cfig = figure(title='')

        data = convergence_df[convergence_df['trial'] == median_trial].copy()
        fold_nos = sorted(set(convergence_df['fold']))

        min_pred_risk = 1e9
        for fn in fold_nos:
            fold_data = data[data['fold'] == fn].copy()
            cfig.line(fold_data['round'], fold_data[f'test'], line_alpha=0.6, line_color='red', line_dash='dotted', legend_label=f'Test Folds')
            cfig.line(fold_data['round'], fold_data[f'train'], line_alpha=0.7, line_color='grey', line_dash='dotted', legend_label=f'Train Folds')

        train_pivot = data.pivot_table(index='round', columns='fold', values='train')#, aggfunc='first')
        test_pivot = data.pivot_table(index='round', columns='fold', values='test')#, aggfunc='first')
        train_pivot.columns = [f'fold_{col}' for col in train_pivot.columns]
        test_pivot.columns = [f'fold_{col}' for col in test_pivot.columns]

        train_pivot['mean'] = train_pivot.mean(axis=1)
        test_pivot['mean'] = test_pivot.mean(axis=1)
        cfig.line(train_pivot.index, train_pivot['mean'], line_alpha=0.5, line_color='grey', line_width=2, legend_label='CV Mean (Train)')
        cfig.line(test_pivot.index, test_pivot['mean'], line_alpha=0.5, line_color='red', line_width=2, legend_label='CV Mean (Test)')

        min_pred_risk_idx = test_pivot['mean'].idxmin()
        min_pred_risk = test_pivot.loc[min_pred_risk_idx, 'mean']
        if min_pred_risk_idx == max(test_pivot['mean'].index):
            print(f'Min prediction risk occurs at the maximum iteration, try increasing the number of boosting rounds')
        cfig.line([min_pred_risk_idx, min_pred_risk_idx], [0, min_pred_risk], legend_label='Min risk', color='green', line_width=2, line_dash='dashed')
        cfig.legend.location = 'top_right'
        cfig.legend.background_fill_alpha = 0.5
        cfig.xaxis.axis_label = r'$$\text{Iteration}$$'
        cfig.yaxis.axis_label = r'$$\text{RMSE}$$'
        return cfig

    def _plot_target_cdfs(cdf_arrays, xlabel):
        cdffig = figure(title='', x_axis_type='log')
        for (cdfx, cdfy) in cdf_arrays:
            cdffig.line(cdfx, cdfy, color='black', line_alpha=0.6, line_width=2, legend_label='Fold CDFs')
        cdffig.xaxis.axis_label = xlabel
        cdffig.yaxis.axis_label = r'$$\text{Pr}(X\leq x) $$'
        cdffig.legend.location = 'top_left'
        return cdffig
    
    def _compute_empirical_cdf(data):
        sorted_data = np.sort(data)
        cdf = np.arange(1, len(sorted_data) + 1) / len(sorted_data)
        return sorted_data, cdf

    xlabel = label_dict[target_col]['x']
    ylabel = label_dict[target_col]['y']

    # Plot 1: RMSE/MAE across attribute sets
    attr_fig, best_attr_set = _plot_attribute_order_line()
    # attr_fig = _plot_attribute_order_cdf()
    plots.append(attr_fig)

    # Plot 2: Actual vs Predicted for best set
    best_attr = [e for e in results_df.keys() if e.endswith(best_attr_set)][0]
    best_result = results_df[best_attr]['oos_predictions']
    sfig, median_trial = _plot_scatter_with_regression(best_result, xlabel, ylabel, target_col)
    plots.append(sfig)

    # Plot 3: Convergence plot
    convergence_df = results_df[best_attr]['convergence']
    cfig = _plot_convergence(convergence_df, median_trial)
    plots.append(cfig)

    # Plot 4: CDFs
    cdf_arrays = []
    for i, grp_result in results_df[best_attr]['oos_predictions'].groupby('fold'):
        sorted_data, cdf = _compute_empirical_cdf(grp_result['actual'].values)
        cdf_arrays.append((sorted_data, cdf))
    cdffig = _plot_target_cdfs(cdf_arrays, xlabel)
    plots.append(cdffig)

    return plots

In the sequence of plots below, we change the order that groups of attributes are added to training.

We split the CF folds using a stratified approach to balance the target variable distribution in each fold (right-most plot).

The plot 3rd from left shows the objective score at each iteration to show how the training and test sets compare. We don’t want to continue training when the test sets are not improving.

# target_ranges = {'mean_uar': (14, 35), 'sd_uar': (20, 45),
#                 'mean_logx': (0.45, 1.25), 'sd_logx': (0.4, 0.5)}
# target_ranges = {'mean_uar': (14, 35), 'sd_uar': (20, 45),
#                 'mean_logx': (0.45, 1.25), 'sd_logx': (0.4, 0.5)}
n = 4
all_plots = []
for c in target_cols:
    data = group_results_dict[c][n]
    eval_metric = eval_metrics[loss]
    grp_plots = create_results_plots(c, data['results'], data['order'], eval_metric)
    all_plots += grp_plots

layout = gridplot(all_plots, ncols=4, width=300, height=275)
show(layout)
n = 3
all_plots = []
for c in target_cols:
    data = group_results_dict[c][n]
    eval_metric = eval_metrics[loss]
    grp_plots = create_results_plots(c, data['results'], data['order'], eval_metric)
    all_plots += grp_plots

layout = gridplot(all_plots, ncols=4, width=300, height=275)
show(layout)
Min prediction risk occurs at the maximum iteration, try increasing the number of boosting rounds
Min prediction risk occurs at the maximum iteration, try increasing the number of boosting rounds
n = 2
all_plots = []
for c in target_cols:
    data = group_results_dict[c][n]
    eval_metric = eval_metrics[loss]
    grp_plots = create_results_plots(c, data['results'], data['order'], eval_metric)
    all_plots += grp_plots

layout = gridplot(all_plots, ncols=4, width=300, height=275)
show(layout)
Min prediction risk occurs at the maximum iteration, try increasing the number of boosting rounds
n = 1
all_plots = []
for c in target_cols:
    data = group_results_dict[c][n]
    eval_metric = eval_metrics[loss]
    grp_plots = create_results_plots(c, data['results'], data['order'], eval_metric)
    all_plots += grp_plots

layout = gridplot(all_plots, ncols=4, width=300, height=275)
show(layout)
Min prediction risk occurs at the maximum iteration, try increasing the number of boosting rounds
Min prediction risk occurs at the maximum iteration, try increasing the number of boosting rounds

3.6.1. Test the sensitivity to Order of attribute groups#

Note that in the first plot (at left) the attribute groups are additive. That is, the GBM is trained on just the first group listed in the categorical x-axis, then the second group attributes are added to the first, then the third group is added to the first and second, and so forth.

3.6.2. Test randomly permuted target values#

As a last iteration, randomize the order of the mean_runoff column to test what the algorithm is learning.

The predictive power decreases substantially across all groupings of input attributes.

random_results = {}
group_order = group_1
for tc in target_cols:    
    print(f'Processing randomization for target column: {tc}')
    test_results_fname = f'{tc}_prediction_results_shuffled.npy'
    test_results_fpath = os.path.join(results_folder, test_results_fname)
    
    if os.path.exists(test_results_fpath):

        shuffled_test_results = np.load(test_results_fpath, allow_pickle=True).item()
    else:
        shuffled_df = filtered_df.copy()
        for attr in all_attributes:
            # randomly shuffle the order of attribute values
            attr_values = filtered_df[attr].values
            np.random.shuffle(attr_values)
            shuffled_df[attr] = attr_values
        
        shuffled_test_results = predict_runoff_from_attributes(shuffled_df, tc, group_order, results_folder, n_boost_rounds, n_optimization_rounds, loss)
        np.save(test_results_fpath, shuffled_test_results)

    random_results[tc] = {'order': group_order, 'results': shuffled_test_results}
Processing randomization for target column: uar_mean
Processing randomization for target column: uar_std
Processing randomization for target column: uar_median
 Processing climate attribute set
---------------------------------------------------------------------------
XGBoostError                              Traceback (most recent call last)
Cell In[24], line 19
     16         np.random.shuffle(attr_values)
     17         shuffled_df[attr] = attr_values
---> 19     shuffled_test_results = predict_runoff_from_attributes(shuffled_df, tc, group_order, results_folder, n_boost_rounds, n_optimization_rounds, loss)
     20     np.save(test_results_fpath, shuffled_test_results)
     22 random_results[tc] = {'order': group_order, 'results': shuffled_test_results}

Cell In[16], line 15, in predict_runoff_from_attributes(df, target_column, group_order, results_folder, n_boost_rounds, n_optimization_rounds, loss)
     13 predictor_attributes += attribute_set_dict[set_name] 
     14 input_data = df[['official_id'] + predictor_attributes + [target_column]].copy()
---> 15 result_df, all_predictions_df, all_convergence_df = run_xgb_CV_trials(
     16     set_name, predictor_attributes, target_column, input_data, 
     17     n_optimization_rounds, nfolds, n_boost_rounds, results_folder, 
     18     loss=loss,
     19 )
     20 # store the test set predictions and actuals
     21 results[set_name] = {
     22     'all_results': result_df,
     23     'convergence': all_convergence_df,
     24     'oos_predictions': all_predictions_df,
     25 } 

File ~/code/distribution_estimation/docs/notebooks/xgb_functions.py:90, in run_xgb_CV_trials(set_name, features, target, input_data, n_optimization_rounds, nfolds, num_boost_rounds, results_folder, loss, random_seed)
     87 Y_train, Y_test = Y_input[train_idx], Y_input[test_idx]
     88 test_ids = input_data.iloc[test_idx]['official_id'].values
---> 90 preds, train_perf, test_perf, eval_key = _train_single_fold(
     91     X_train, X_test, Y_train, Y_test, params.copy(), loss, num_boost_rounds,
     92 )
     94 best_test_round = np.argmin(test_perf)
     95 fold_perfs.append(test_perf[best_test_round])

File ~/code/distribution_estimation/docs/notebooks/xgb_functions.py:29, in _train_single_fold(X_train, X_test, Y_train, Y_test, params, loss, num_boost_rounds)
     27     dtest = xgb.QuantileDMatrix(X_test, label=Y_test)
     28 else:
---> 29     dtrain = xgb.DMatrix(X_train, label=Y_train)
     30     dtest = xgb.DMatrix(X_test, label=Y_test)
     32 evals_result = {}

File ~/code/data_analysis/lib/python3.12/site-packages/xgboost/core.py:729, in require_keyword_args.<locals>.throw_if.<locals>.inner_f(*args, **kwargs)
    727 for k, arg in zip(sig.parameters, args):
    728     kwargs[k] = arg
--> 729 return func(**kwargs)

File ~/code/data_analysis/lib/python3.12/site-packages/xgboost/core.py:897, in DMatrix.__init__(self, data, label, weight, base_margin, missing, silent, feature_names, feature_types, nthread, group, qid, label_lower_bound, label_upper_bound, feature_weights, enable_categorical, data_split_mode)
    894 assert handle is not None
    895 self.handle = handle
--> 897 self.set_info(
    898     label=label,
    899     weight=weight,
    900     base_margin=base_margin,
    901     group=group,
    902     qid=qid,
    903     label_lower_bound=label_lower_bound,
    904     label_upper_bound=label_upper_bound,
    905     feature_weights=feature_weights,
    906 )
    908 if feature_names is not None:
    909     self.feature_names = feature_names

File ~/code/data_analysis/lib/python3.12/site-packages/xgboost/core.py:729, in require_keyword_args.<locals>.throw_if.<locals>.inner_f(*args, **kwargs)
    727 for k, arg in zip(sig.parameters, args):
    728     kwargs[k] = arg
--> 729 return func(**kwargs)

File ~/code/data_analysis/lib/python3.12/site-packages/xgboost/core.py:961, in DMatrix.set_info(self, label, weight, base_margin, group, qid, label_lower_bound, label_upper_bound, feature_names, feature_types, feature_weights)
    958 from .data import dispatch_meta_backend
    960 if label is not None:
--> 961     self.set_label(label)
    962 if weight is not None:
    963     self.set_weight(weight)

File ~/code/data_analysis/lib/python3.12/site-packages/xgboost/core.py:1099, in DMatrix.set_label(self, label)
   1090 """Set label of dmatrix
   1091 
   1092 Parameters
   (...)   1095     The label information to be set into DMatrix
   1096 """
   1097 from .data import dispatch_meta_backend
-> 1099 dispatch_meta_backend(self, label, "label", "float")

File ~/code/data_analysis/lib/python3.12/site-packages/xgboost/data.py:1586, in dispatch_meta_backend(matrix, data, name, dtype)
   1584     return
   1585 if _is_np_array_like(data):
-> 1586     _meta_from_numpy(data, name, dtype, handle)
   1587     return
   1588 if _is_arrow(data):

File ~/code/data_analysis/lib/python3.12/site-packages/xgboost/data.py:1533, in _meta_from_numpy(data, field, dtype, handle)
   1531     raise ValueError("Masked array is not supported.")
   1532 interface_str = array_interface(data)
-> 1533 _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface_str))

File ~/code/data_analysis/lib/python3.12/site-packages/xgboost/core.py:310, in _check_call(ret)
    299 """Check the return value of C API call
    300 
    301 This function will raise exception when error occurs.
   (...)    307     return value from API calls
    308 """
    309 if ret != 0:
--> 310     raise XGBoostError(py_str(_LIB.XGBGetLastError()))

XGBoostError: [00:00:07] /workspace/src/data/array_interface.cu:44: Check failed: err == cudaGetLastError() (0 vs. 2) : 
Stack trace:
  [bt] (0) /home/danbot/code/data_analysis/lib/python3.12/site-packages/xgboost/lib/libxgboost.so(+0x2a6ecc) [0x71ba378a6ecc]
  [bt] (1) /home/danbot/code/data_analysis/lib/python3.12/site-packages/xgboost/lib/libxgboost.so(+0xb862dc) [0x71ba381862dc]
  [bt] (2) /home/danbot/code/data_analysis/lib/python3.12/site-packages/xgboost/lib/libxgboost.so(+0x565ece) [0x71ba37b65ece]
  [bt] (3) /home/danbot/code/data_analysis/lib/python3.12/site-packages/xgboost/lib/libxgboost.so(XGDMatrixSetInfoFromInterface+0x10b) [0x71ba377b3e5b]
  [bt] (4) /lib/x86_64-linux-gnu/libffi.so.8(+0x7b16) [0x71bae5b1cb16]
  [bt] (5) /lib/x86_64-linux-gnu/libffi.so.8(+0x43ef) [0x71bae5b193ef]
  [bt] (6) /lib/x86_64-linux-gnu/libffi.so.8(ffi_call+0x12e) [0x71bae5b1c0be]
  [bt] (7) /usr/lib/python3.12/lib-dynload/_ctypes.cpython-312-x86_64-linux-gnu.so(+0xe11c) [0x71bae5b8a11c]
  [bt] (8) /usr/lib/python3.12/lib-dynload/_ctypes.cpython-312-x86_64-linux-gnu.so(+0x92af) [0x71bae5b852af]

3.6.3. View results of shuffled target variable (mean runoff)#

all_shuffled_plots = []
for tc in target_cols:
    shuffled_results = random_results[tc]['results'].copy()
    group_order = random_results[tc]['order']
    shuffled_runoff_plots = create_results_plots(tc, shuffled_results, group_order, eval_metric, title='')
    all_shuffled_plots += shuffled_runoff_plots

layout = gridplot(all_shuffled_plots, ncols=4, width=300, height=275)
show(layout)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[59], line 3
      1 all_shuffled_plots = []
      2 for tc in target_cols:
----> 3     shuffled_results = random_results[tc]['results'].copy()
      4     group_order = random_results[tc]['order']
      5     shuffled_runoff_plots = create_results_plots(tc, shuffled_results, group_order, eval_metric, title='')

KeyError: 'uar_median'
# process the results into a dictionary object for easy access to predicted values
mean_predictions = []
for tc in target_cols:
    group_results = group_results_dict[tc]
    group, group_data = 1, group_results[1]
    eval_group = '+land_cover' # evaluate on climate + terrain + land_cover set (neglect soil)
    data = group_data['results'][eval_group]
    oos_predictions = (
        data['oos_predictions']
        .groupby('official_id')['predicted']
        .mean()
        .to_frame()
    )
    # make a dict of official_id: actual from data['oos_predictions']
    actual_dict = data['oos_predictions'].set_index('official_id')['actual'].to_dict()
    # store the predictions in a dataframe
    oos_predictions.rename(columns={'predicted': f'{tc}_mean_predicted'}, inplace=True)
    oos_predictions = oos_predictions[[f'{tc}_mean_predicted']]
    oos_predictions[f'{tc}_actual'] = oos_predictions.index.map(actual_dict)
    mean_predictions.append(oos_predictions)

mean_predictions_df = pd.concat(mean_predictions, axis=1)
mean_predictions_df.reset_index(inplace=True)
mean_predictions_df.to_csv(os.path.join(results_folder, 'mean_parameter_predictions.csv'), index=False)

3.7. Discussion#

  • Reordering the attribute groupings suggests there are interactions between attributes in model training.

  • Across all orderings, the climate attributes provide most predictive information,

  • Soil attributes contribute little or no explanatory power to the model.

  • Terrain and land cover attributes provide some predictive information over soil, but the joint entropy with climate seems to represent much of this information gain,

  • Randomly permuting the order of the target variable, mean_runoff erases all predictive power.

3.8. Citations#